This document will allow you to reproduce all supplementary figures.
Set working directory and load all necessary libraries and functions.
# If not installed, install and load the package "here", else: only load the package.
err <- try(library("here", character.only = TRUE), silent = TRUE)
## here() starts at /Users/lgoeminn/Documents/phD/MSqRobHurdlePaper
if (class(err) == 'try-error') {
install.packages("here", repos = "https://cloud.r-project.org")
library("here", character.only = TRUE)
}
wd <- here()
# Optional: change the working directory
setwd(wd)
# Load all functions and libraries
source(paste0(wd, "/R/functions.r"))
## Loading required namespace: BiocManager
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: mzR
## Loading required package: Rcpp
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.0)
## than is installed on your system (1.0.1). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 3.6.1
## Loading required package: stats4
## Warning: multiple methods tables found for 'lengths'
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: ProtGenerics
##
## This is MSnbase version 2.8.3
## Visit https://lgatto.github.io/MSnbase/ to get started.
##
## Attaching package: 'MSnbase'
## The following object is masked from 'package:stats':
##
## smooth
## The following object is masked from 'package:base':
##
## trimws
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: future
##
## Attaching package: 'future'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:S4Vectors':
##
## values
## Loading required package: SummarizedExperiment
## Warning: package 'SummarizedExperiment' was built under R version 3.6.1
## Loading required package: GenomicRanges
## Warning: package 'GenomicRanges' was built under R version 3.6.1
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.6.1
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomeInfoDb
## Warning: no function found corresponding to methods exports from 'XVector'
## for: 'concatenateObjects'
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
## Loading required package: BiocParallel
## Warning: multiple methods tables found for 'lengths'
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
##
## simplify
## The following objects are masked from 'package:base':
##
## aperm, apply
##
## Attaching package: 'stageR'
## The following object is masked from 'package:methods':
##
## getMethod
source(paste0(wd,"/R/plot_functions.R"))
Give session info for reproducibility.
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] proDA_1.1.2 msqrobsum_0.0.0.9000
## [3] MSqRob_0.7.6 stageR_1.3.29
## [5] SummarizedExperiment_1.14.1 DelayedArray_0.6.0
## [7] BiocParallel_1.14.2 matrixStats_0.54.0
## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.16.0
## [11] IRanges_2.18.3 edgeR_3.24.3
## [13] furrr_0.1.0.9002 future_1.13.0
## [15] readxl_1.3.1 colorspace_1.4-1
## [17] zoo_1.8-6 corpcor_1.6.9
## [19] survival_2.44-1.1 lme4_1.1-21
## [21] Matrix_1.2-17 limma_3.36.5
## [23] MSnbase_2.8.3 ProtGenerics_1.12.0
## [25] S4Vectors_0.22.1 mzR_2.16.1
## [27] Rcpp_1.0.1 Biobase_2.40.0
## [29] BiocGenerics_0.26.0 forcats_0.4.0
## [31] stringr_1.4.0 dplyr_0.8.1
## [33] purrr_0.3.2 readr_1.3.1
## [35] tidyr_0.8.3 tibble_2.1.3
## [37] ggplot2_3.1.1 tidyverse_1.2.1
## [39] MSstats_3.12.2 usethis_1.5.0
## [41] devtools_2.0.2 remotes_2.0.4
## [43] preprocessCore_1.42.0 here_0.1
##
## loaded via a namespace (and not attached):
## [1] snow_0.4-3 backports_1.1.4 plyr_1.8.4
## [4] lazyeval_0.2.2 splines_3.6.0 listenv_0.7.0
## [7] digest_0.6.19 foreach_1.4.4 BiocInstaller_1.30.0
## [10] htmltools_0.3.6 gdata_2.18.0 magrittr_1.5
## [13] memoise_1.1.0 doParallel_1.0.14 globals_0.12.4
## [16] modelr_0.1.4 prettyunits_1.0.2 rvest_0.3.4
## [19] ggrepel_0.8.1 haven_2.1.0 xfun_0.7
## [22] callr_3.2.0 crayon_1.3.4 RCurl_1.95-4.12
## [25] jsonlite_1.6 impute_1.54.0 iterators_1.0.10
## [28] glue_1.3.1 gtable_0.3.0 zlibbioc_1.26.0
## [31] XVector_0.20.0 pkgbuild_1.0.3 future.apply_1.3.0
## [34] scales_1.0.0 vsn_3.48.1 httr_1.4.0
## [37] gplots_3.0.1.1 pkgconfig_2.0.2 XML_3.98-1.20
## [40] locfit_1.5-9.1 tidyselect_0.2.5 rlang_0.4.2
## [43] reshape2_1.4.3 munsell_0.5.0 cellranger_1.1.0
## [46] tools_3.6.0 cli_1.1.0 generics_0.0.2
## [49] broom_0.5.2 evaluate_0.14 mzID_1.18.0
## [52] yaml_2.2.0 processx_3.3.1 knitr_1.23
## [55] fs_1.3.1 caTools_1.17.1.2 randomForest_4.6-14
## [58] ncdf4_1.16.1 nlme_3.1-140 xml2_1.2.0
## [61] compiler_3.6.0 rstudioapi_0.10 testthat_2.1.1
## [64] affyio_1.50.0 marray_1.58.0 stringi_1.4.3
## [67] ps_1.3.0 desc_1.2.0 lattice_0.20-38
## [70] nloptr_1.2.1 pillar_1.4.1 BiocManager_1.30.4
## [73] MALDIquant_1.19.3 data.table_1.12.2 bitops_1.0-6
## [76] R6_2.4.0 pcaMethods_1.72.0 affy_1.58.0
## [79] KernSmooth_2.23-15 sessioninfo_1.1.1 codetools_0.2-16
## [82] boot_1.3-22 MASS_7.3-51.4 gtools_3.8.1
## [85] assertthat_0.2.1 pkgload_1.0.2 rprojroot_1.3-2
## [88] minpack.lm_1.2-1 withr_2.1.2 GenomeInfoDbData_1.1.0
## [91] doSNOW_1.0.16 hms_0.4.2 grid_3.6.0
## [94] minqa_1.2.4 rmarkdown_1.13 lubridate_1.7.4
Important: make sure you have run the CPTAC and HEART analyses, or load the necessary files as follows:
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC2.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_KNN1.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_Perseus_imp.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_mixed.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_RegEM_imp.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_co_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_sc_co_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_edgeR_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_edgeR_sc_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_beta_binom_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_beta_binom_sc_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_Hurdle.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_PI_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_KNN1_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_RegEM_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_mixed_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_ProDA_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_ProPCA_full_QN_int.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_mixture.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_DEP.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_DEP_noimp.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_MSstats_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_MSstats_noimp_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_imp_full.RData"))
load(paste0(wd, "/save_files_PXD006675/peptides_HEART2.RData"))
load(paste0(wd, "/save_files_PXD006675/res_HEART_Hurdle.RData"))
load(paste0(wd, "/save_files_PXD006675/overlap_PHM.RData"))
Here, we will examine the amount of missing values in 105 peptides.txt files originating from 79 PRIDE projects searched with MaxQuant and published in the PRIDE Archive repository in 2017. All of these projects either investigate the full proteome of an organism, or a subpart of it (e.g. the proteome of a certain organelle, all chromatin-binding proteins, etc.).
Since many of the source files are too big, we cannot put them on the GitHub server, however, they can be downloaded freely from the PRIDE Archive repository using their corresponding identifier. We also provide the file “missingness_PRIDE.RData” which contains the amounts of missing values for each file and can easily be loaded into R.
Warning: this option might take some time, as for quite some PRIDE projects, the peptides.txt files are hidden inside zip files that are several GB large. Alternatively, you could download a few of these files and compare their numbers of missing values to the table in supplementary material to confirm that our analysis is correct.
# # 1. Step 1: download the peptides.txt files for all 79 PRIDE projects to a folder "PRIDE_projects" in the working directory.
#
# # 2. Step 2: set the path to each peptides.txt file.
#
# filepaths <- c(paste0(wd, "/PRIDE_projects/PXD006064/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004436/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD000514/MaxQuantOutput/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005353/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006708/ACE_0056_peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006708/ACE_0245_1-6_13-16_peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006708/ACE_0245_7-12_13-16_peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006708/ACE_0245_17-22_peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006323/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005880/peptides2.txt"),
# paste0(wd, "/PRIDE_projects/PXD007259/Proteomics_NCTC\ 8325-4_dclpX/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD007259/Secretome\ Titration_NCTC\ 8325/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005224/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005610/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006879/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006870/LEAVES/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006870/ROOTS/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006171/Non_phospho/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005971/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006835/UK4_Fap/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006835/EColi_Curli/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004420/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006486/txt_IsoSeq/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006486/txt_siSF3B1/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006486/txt_siSRSF1/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD003395/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004094/MaxQuant_Output_P18/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004094/MaxQuant_Output_P24/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004094/MaxQuant_Output_P30/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004094/MaxQuant_Output_P36/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005929/20150821_txt/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005929/20151203_txt/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005929/20151209_txt/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006596/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004075/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004076/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005574/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006888/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006531/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006103/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005442/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004538/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006128/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005559/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005035/MQresults_EDL_SOL/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005035/MQresults_Vastus/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004421/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006302/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006314/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006332/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD003864/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD003872/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005182/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006251/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD003260/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD003531/PRIMARY/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005437/Proteome_analysis/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004888/peptidesFn.txt"),
# paste0(wd, "/PRIDE_projects/PXD004888/peptidesPg.txt"),
# paste0(wd, "/PRIDE_projects/PXD004304/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005861/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005532/MQ\ result\ IB1-20/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005532/MQ\ result\ IB21-30/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005532/MQ\ result\ IB31-45/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005402/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005286/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004730/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005467/Host\ chromatin\ bound\ proteome/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005467/HSV_1_chromatin_bound_proteome/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005467/Time\ course\ -\ HSV-1\ proteome/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004273/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD004816/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD003632/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005146/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD003883/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005305/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006129/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD007533/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005833/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD007034/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD008380/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005587/peptides-Control\ KP.txt"),
# paste0(wd, "/PRIDE_projects/PXD005587/peptides-Doxy\ Treated\ KP.txt"),
# paste0(wd, "/PRIDE_projects/PXD005587/peptides-SM\ treated\ KP.txt"),
# paste0(wd, "/PRIDE_projects/PXD006752/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006139/H2228_drug_treatment_LFQ/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006139/mouse_kidney_trametinib/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005259/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006914/peptides_normal.txt"),
# paste0(wd, "/PRIDE_projects/PXD006914/peptides_xman.txt"),
# paste0(wd, "/PRIDE_projects/PXD004555/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD005736/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD001639/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD006321/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD007567/peptides.txt"),
# paste0(wd, "/PRIDE_projects/PXD007725/peptides.txt"))
#
# # Note that the PRIDE identifiers can be easily extracted from these filepaths using the "substr" function.
#
# unique(substr(filepaths,64,72)) # 64 and 72 might be changed depending on how long the path to your working directory is
#
# unique(substr(filepaths,64,72)) %>% length
# # 73 projects
# filepaths %>% length
# # 96 peptides.txt files
#
# # 3. Execute the counting for the number of missing and observed values. This might take a short while.
#
# n_missing <- rep(NA, length(filepaths))
# n_tot <- rep(NA, length(filepaths))
#
# for(i in 1:length(n_tot)){
#
# test <- read.table(filepaths[i], sep="\t", quote="", comment.char = "", header=TRUE)
# test2 <- test[,grepl("Intensity.",colnames(test))] == 0
#
# n_missing[i] <- sum(test2)
# n_tot[i] <- length(test2)
#
# }
#
# perc_missingness <- n_missing/n_tot
# n_observed <- n_tot-n_missing
# Import the number of observed and missing values for each peptides.txt file.
load(paste0(wd,"/save_files_missingness_PRIDE/missingness_PRIDE.RData"))
mean(perc_missingness)
## [1] 0.435468
# 0.435468
median(perc_missingness)
## [1] 0.4354536
# 0.4354536
range(perc_missingness)
## [1] 0.1581222 0.8211029
# 0.1581222 0.8211029
# Make the histogram
cex.main <- 1.2
cex.lab <- 1
cex <- 1
# #Plotting this plot in SVG:
# grDevices::svg(paste0(wd,"/missing_imp.svg"), width=10, height=10)
# cex.main <- 3.5
# cex.lab <- 3
# cex <- 2.5
# par(mar=c(5.1*cex,4.1*cex+3,4.1*cex,2.1*cex+7))
# Histogram of the missing values (Fig. 1 a)
hist(perc_missingness, las = 1, lty="blank", col = "#7C7C7C", xlim = c(0,1), main = "% missing values in 96 searches", xlab = "% missing values", xaxt="n", cex.main = cex.main, cex.lab = cex.lab, cex = cex, cex.axis = cex)
axis(1, at=seq(0, 1, by=0.2), labels=c("0", "20", "40","60", "80", "100"), cex = cex, cex.lab = cex.lab, cex.axis = cex)
# par(mfrow=c(1,1))
# par(mar=c(5.1,4.1,4.1,2.1))
#
# # End SVG plot
# dev.off()
# Import non-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC) <- str_replace(sampleNames(proteinGroups.CPTAC), "LFQ.intensity.", "") %>% make.names
pData(proteinGroups.CPTAC) <- pData(peptides.CPTAC2)
# Import Perseus-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed_imputed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC.Perseus.imp <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC.Perseus.imp) <- str_replace(sampleNames(proteinGroups.CPTAC.Perseus.imp), "LFQ.intensity.", "") %>% make.names
sample.names <- paste0(sampleNames(proteinGroups.CPTAC) %>% substr(3, 3), sampleNames(proteinGroups.CPTAC) %>% substr(5, 5))
### Impute with KNN-1
proteinGroups.CPTAC.KNN1 <- impute(proteinGroups.CPTAC, method = "knn")
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 700 rows with more than 50 % entries missing;
## mean imputation used for these rows
cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))
### Impute with mixed imputation
count.proteins <- MSnSet2df(proteinGroups.CPTAC) %>% as_tibble %>% group_by(feature, condition) %>% summarize(n())
proteins_MNAR <- names(which(table(count.proteins$feature) != 3))
## Get a logical vector
MNAR <- rownames(exprs(proteinGroups.CPTAC)) %in% proteins_MNAR
print('Number of proteins that are Missing Not At Random')
## [1] "Number of proteins that are Missing Not At Random"
print(table(MNAR))
## MNAR
## FALSE TRUE
## 1061 401
## Perform a mixed imputation
# The DEP example in the vignette considers a protein to have missing values not at random (MNAR) if it has missing values in all replicates of at least one condition.
randna <- !MNAR
proteinGroups.CPTAC.mixed <- proteinGroups.CPTAC
exprs(proteinGroups.CPTAC.mixed)[randna, ] <- exprs(impute(proteinGroups.CPTAC, "knn"))[randna,]
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 700 rows with more than 50 % entries missing;
## mean imputation used for these rows
exprs(proteinGroups.CPTAC.mixed)[!randna, ] <- exprs(impute(proteinGroups.CPTAC, "MinProb"))[!randna,]
## Loading required package: imputeLCMD
## Loading required package: tmvtnorm
## Loading required package: mvtnorm
## Loading required package: gmm
## Loading required package: sandwich
##
## Attaching package: 'gmm'
## The following object is masked from 'package:lme4':
##
## checkConv
## Loading required package: norm
## Loading required package: pcaMethods
##
## Attaching package: 'pcaMethods'
## The following object is masked from 'package:stats':
##
## loadings
## Loading required package: impute
## [1] 0.2967546
### Impute with RegEM imputation
# 1. First remove this one protein that has no observed values (RegEM cannot handle this)
proteinGroups.CPTAC2 <- proteinGroups.CPTAC[rowSums(is.na(Biobase::exprs(proteinGroups.CPTAC))) != ncol(Biobase::exprs(proteinGroups.CPTAC)),]
# 2. Write protein info as a tab delimited file
write.table(cbind(protein = row.names(Biobase::exprs(proteinGroups.CPTAC2)), Biobase::exprs(proteinGroups.CPTAC2)), file = paste0(wd,"/datasets/CPTAC/proteinGroups_CPTAC.txt"), sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
# 3. Download the MatLab files from https://github.com/tapios/RegEM#installation.
# 4. Execute the following code in MatLab:
# X=readmatrix("proteinGroups_CPTAC.txt")
# X_imputed=regem(X)
# writematrix(X_imputed,"proteinGroups_CPTAC_RegEM_imp.txt",'Delimiter','tab')
# The resulting proteinGroups_CPTAC_RegEM_imp.txt file can be found in the "/datasets/CPTAC" folder.
# 5. Import the imputed values
exprs.RegEM.imp <- read.table(file = paste0(wd,"/datasets/CPTAC/proteinGroups_CPTAC_RegEM_imp.txt"), sep = "\t", header = FALSE, quote = "", dec = ".")
# 6. Preprocess the data frame
exprs.RegEM.imp <- exprs.RegEM.imp[,-1]
dimnames(exprs.RegEM.imp) <- dimnames(exprs(proteinGroups.CPTAC2))
# 7. Put the imputed values in the MSnSet object proteins.CPTAC.RegEM.imp
proteinGroups.CPTAC.RegEM.imp <- proteinGroups.CPTAC2
Biobase::exprs(proteinGroups.CPTAC.RegEM.imp) <- exprs.RegEM.imp[,1:27] %>% as.matrix
We here show the sample-wise effect of Perseus imputation at the level of MaxQuant’s MaxLFQ-values (protein level). Blue are the original values, red are the imputed values.
# Histograms for the effect of Perseus imputation at the MaxLFQ-level (Supplementary Fig. 2)
# grDevices::svg(paste0(wd,"/Perseus_protein_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
for(i in 1:ncol(exprs(proteinGroups.CPTAC.Perseus.imp))){
p1 <- hist(exprs(proteinGroups.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(MaxLFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
We here show the sample-wise effect of Perseus imputation at the peptide level. Blue are the original values, red are the imputed values.
# Histograms for the effect of Perseus imputation at the peptide-level (Supplementary Fig. 3)
# grDevices::svg(paste0(wd,"/Perseus_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
for(i in 1:ncol(exprs(peptides.CPTAC.Perseus.imp))){
p1 <- hist(exprs(peptides.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
condition.names <- unique(sampleNames(proteinGroups.CPTAC) %>% substr(3, 3))
cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))
# grDevices::svg(paste0(wd,"/histograms_all_imputations_proteins.svg"), width=60, height=120)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(4,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
# Perseus imputation
for(i in 1:3){
p1 <- hist(exprs(proteinGroups.CPTAC.Perseus.imp)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC.Perseus.imp)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation condition ", condition.names[i]), xlab = "log2(MaxLFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# kNN imputation
for(i in 1:3){
p1 <- hist(exprs(proteinGroups.CPTAC.KNN1)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC.KNN1)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation condition ", condition.names[i]), xlab = "log2(MaxLFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# Mixed imputation
for(i in 1:3){
p1 <- hist(exprs(proteinGroups.CPTAC.mixed)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC.mixed)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Mixed imputation condition ", condition.names[i]), xlab = "log2(MaxLFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# RegEM imputation
for(i in 1:3){
p1 <- hist(exprs(proteinGroups.CPTAC.RegEM.imp)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC.RegEM.imp)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC)[,which(grepl(condition.names[i], sampleNames(proteinGroups.CPTAC)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("RegEM imputation condition ", condition.names[i]), xlab = "log2(MaxLFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
condition.names <- unique(sampleNames(peptides.CPTAC.KNN1) %>% substr(3, 3))
cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))
# grDevices::svg(paste0(wd,"/histograms_all_imputations_peptides.svg"), width=60, height=120)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(4,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
# Perseus imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.Perseus.imp)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.Perseus.imp)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# kNN imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.KNN1)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.KNN1)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# Mixed imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.mixed)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.mixed)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Mixed imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# RegEM imputation
for(i in 1:3){
p1 <- hist(exprs(peptides.CPTAC.RegEM.imp)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC.RegEM.imp)))], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,which(grepl(condition.names[i], sampleNames(peptides.CPTAC2)))], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("RegEM imputation condition ", condition.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
# Due to different preprocessing methods, some proteins cannot be detected with some methods.
# By default we use the full set, i.e. all protein identifiers over all methods.
# Note that all methods used either the preprocessing of MSqRob, of Perseus o of MSstats.
all.proteins <- unique(unlist(lapply(list(res.CPTAC.Hurdle, CPTAC.Perseus.full, res.CPTAC.MSstats.noimp.full), "[", "protein")))
full.Base <- tibble(protein = rep(all.proteins, 3),
UPS = grepl("ups", protein),
contrast = rep(c("condition6B-condition6A",
"condition6C-condition6A",
"condition6C-condition6B"), each = length(all.proteins)))
# We will also make some figure with the common set, i.e. only the proteins that have been seen in the preprocessing of al methods.
common.proteins <- unique(unlist(lapply(list(res.CPTAC.Hurdle, CPTAC.Perseus.full, res.CPTAC.MSstats.noimp.full), "[", "protein")))
indices <- which(table(c(unique(as.character(res.CPTAC.Hurdle$protein)), unique(as.character(CPTAC.Perseus.full$protein)), unique(as.character(res.CPTAC.MSstats.noimp.full$protein)))) == 3)
common.proteins <- names(table(c(unique(as.character(res.CPTAC.Hurdle$protein)), unique(as.character(CPTAC.Perseus.full$protein)), unique(as.character(res.CPTAC.MSstats.noimp.full$protein)))))[indices]
common.Base <- tibble(protein = rep(common.proteins, 3),
UPS = grepl("ups", protein),
contrast = rep(c("condition6B-condition6A",
"condition6C-condition6A",
"condition6C-condition6B"), each = length(common.proteins)))
# FDR-FTP plot for all imputation-based methods
res.list <- list(res.CPTAC.full,
res.CPTAC.KNN1.full,
res.CPTAC.PI.full,
res.CPTAC.RegEM.full,
res.CPTAC.mixed.full
)
res.list <- lapply(res.list, function(x){
left_join(full.Base, x)
}
)
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
contrast.vec <- c("condition6B-condition6A",
"condition6C-condition6A",
"condition6C-condition6B")
names(contrast.vec) <- c("Condition 6B vs. 6A",
"Condition 6C vs. 6A",
"Condition 6C vs. 6B")
colors <- c("#FF681E", "#88FF9C", "blue", "black", "brown")
sort.list <- list(c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logFC"))
PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE
## condition6B-condition6A
## 3.44827586206897%
## 6.66666666666667%
## 12.5%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 9.375%
## 26.1904761904762%
## 35.4166666666667%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
# FDR-FTP plots for the count-based methods
res.list <- list(res.CPTAC.sc.co.full,
res.CPTAC.edgeR.sc.full,
res.CPTAC.beta.binom.sc.full,
res.CPTAC.co.full,
res.CPTAC.edgeR.full,
res.CPTAC.beta.binom.full,
res.CPTAC.full
)
res.list <- lapply(res.list, function(x){
left_join(full.Base, x)
}
)
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
contrast.vec <- c("condition6B-condition6A",
"condition6C-condition6A",
"condition6C-condition6B")
names(contrast.vec) <- c("Condition 6B vs. 6A",
"Condition 6C vs. 6A",
"Condition 6C vs. 6B")
colors <- c("magenta", "lightgreen", "gold1", "#E15E9E", "darkgreen","yellow3", "#FF681E")
sort.list <- list(c("qvalue", "pvalue", "t", "logOR"),
c("FDR", "PValue", "F", "logFC"),
c("qvalue", "p.value"),
c("qvalue", "pvalue", "t", "logOR"),
c("FDR", "PValue", "F", "logFC"),
c("qvalue", "p.value"),
c("qvalue", "pvalue", "t", "logFC"))
PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 3.44827586206897%
## 6.66666666666667%
## 12.5%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 9.375%
## 26.1904761904762%
## 35.4166666666667%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
Plot the correlation between intensities and counts.
# Make the plots
# BA, CB, CA
upratio <- c(1.565597, 1.571906, 3.137504)
# 1. Top: condition 6B vs. 6A
TP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & UPS)
FP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & !UPS)
scatterHist(x = ((FP.BvsA$logOR) %>% unlist),
y = ((FP.BvsA$logFC) %>% unlist),
x2 = ((TP.BvsA$logOR) %>% unlist),
y2 = ((TP.BvsA$logFC) %>% unlist),
main = "Condition B vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistBA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistBA.svg")
sum(is.na(((FP.BvsA$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.BvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate
# 2. Middle: condition 6C vs. 6A
TP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & UPS)
FP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & !UPS)
scatterHist(x = ((FP.CvsA$logOR) %>% unlist),
y = ((FP.CvsA$logFC) %>% unlist),
x2 = ((TP.CvsA$logOR) %>% unlist),
y2 = ((TP.CvsA$logFC) %>% unlist),
main = "Condition C vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCA.svg")
sum(is.na(((FP.CvsA$logFC) %>% unlist)))
## [1] 158
# 158 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate
# 3. Bottom: condition 6C vs. 6B
TP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & UPS)
FP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & !UPS)
scatterHist(x = ((FP.CvsB$logOR) %>% unlist),
y = ((FP.CvsB$logFC) %>% unlist),
x2 = ((TP.CvsB$logOR) %>% unlist),
y2 = ((TP.CvsB$logFC) %>% unlist),
main = "Condition C vs. B", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCB.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCB.svg")
sum(is.na(((FP.CvsB$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsB$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate
# Create a mock experimental design where there is no differential abundance in reality
# There is however a huge "batch effect" of "condlab", the combined effects of spike-in condition and lab
pData <- pData(peptides.CPTAC2)
pData$condlab <- paste0(pData$condition,pData$lab)
pData$treat <- rep(c("one","two","three"),9)
pData$treat <- factor(pData$treat, levels = c("one", "two", "three"))
peptides.CPTAC2.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC2), fData = AnnotatedDataFrame(fData(peptides.CPTAC2)), pData = AnnotatedDataFrame(pData))
### Recreate peptides.CPTAC3 ###
# We require by default at least 2 identifications of a peptide sequence, as with 1 identification, the model will be perfectly confounded
sel <- rowSums(!is.na(Biobase::exprs(peptides.CPTAC2))) >= 2
peptides.CPTAC3 <- peptides.CPTAC2[sel]
# Again remove proteins that are only identified by one peptide
sel <- fData(peptides.CPTAC3) %>% group_by(protein) %>% mutate(flag = n() > 1) %>% pull(flag)
peptides.CPTAC3 <- peptides.CPTAC3[sel]
# Drop levels
peptides.CPTAC3 <- MSnbase::MSnSet(exprs = Biobase::exprs(peptides.CPTAC3), fData = droplevels(Biobase::fData(peptides.CPTAC3)), pData = Biobase::pData(peptides.CPTAC3))
dim(peptides.CPTAC2)
## [1] 9377 27
# 9377 27
dim(peptides.CPTAC3)
## [1] 9158 27
# 9158 27
length(unique(fData(peptides.CPTAC2)$protein))
## [1] 1381
# 1381
length(unique(fData(peptides.CPTAC3)$protein))
## [1] 1347
# 1347
rm(sel)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 6337209 338.5 11658460 622.7 NA 11658460 622.7
## Vcells 23666720 180.6 41192704 314.3 16384 34260408 261.4
######
peptides.CPTAC3.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC3), fData = AnnotatedDataFrame(fData(peptides.CPTAC3)), pData = AnnotatedDataFrame(pData))
# Create contrast matrix for the mock analysis
form <- expression ~ (1|treat) + (1|condlab) + (1|sample) + (1|sequence)
contrasts <- contrast_helper(form, peptides.CPTAC3.noeff, treat)
contrasts <- contrasts[c(1,3,2),c(2,1,3)]
colnames(contrasts)[3] <- "treatthree-treattwo"
contrasts[c(2,3),3] <- c(-1,1)
contrasts
## treattwo-treatone treatthree-treatone treatthree-treattwo
## treatone -1 -1 0
## treattwo 1 0 -1
## treatthree 0 1 1
# Fit the models, this might take a few minutes
# res.CPTAC.noeff <- do_mm(formula = form, msnset = peptides.CPTAC3.noeff, group_var = protein, contrasts = contrasts, type_df = "conservative", max_iter = 20)
# Or load the data
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff.RData"))
annotation.CPTAC <- tibble(protein = fData(peptides.CPTAC2)$protein, UPS = grepl("ups",fData(peptides.CPTAC2)$protein), gene.name = fData(peptides.CPTAC2)$gene.name, protein.name = fData(peptides.CPTAC2)$protein.name) %>% distinct
res.CPTAC.Base <- rbind(annotation.CPTAC, annotation.CPTAC, annotation.CPTAC)
res.CPTAC.Base$contrast <- c(rep("treattwo-treatone",nrow(annotation.CPTAC)), rep("treatthree-treatone",nrow(annotation.CPTAC)), rep("treatthree-treattwo",nrow(annotation.CPTAC)))
res.CPTAC.noeff.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff$result)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% arrange(pvalue)
prot.CPTAC.noeff.co <- do_count_groups(peptides.CPTAC2.noeff, group_var = protein, keep_fData_cols = c("gene.name","protein.name"))
# Remove estimates based on only one sample in one or more conditions from the data
not_one <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(1,25, by=3)] > 0) %>% rowSums < 2)
not_two <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(2,26, by=3)] > 0) %>% rowSums < 2)
not_three <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(3,27, by=3)] > 0) %>% rowSums < 2)
### Important: re-adjust the false discovery rate! ###
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_one)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treatone")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_two)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_three)) & (res.CPTAC.noeff.full$contrast %in% c("treatthree-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% group_by(contrast) %>%
mutate(qvalue = p.adjust(pvalue, method = "BH"))
# Or load the MSqRob result object via:
# load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_full.RData"))
### Fit the quasi-binomial model ###
form.co <- ~ treat + condlab
# contrasts <- contrast_helper(~ treat, peptides.CPTAC2.noeff, treat)
# res.CPTAC.noeff.co <- do_glm(formula = form.co, msnset = prot.CPTAC.noeff.co, group_var = protein, contrasts = contrasts, contFun = "contEst")
# OR: load the model:
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_co.RData"))
res.CPTAC.noeff.co.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff.co$result)
## Joining, by = c("protein", "contrast")
res.CPTAC.noeff.co.full <- res.CPTAC.noeff.co.full %>% arrange(pvalue)
# cols <- colorRampPalette(c("#FF3100", "#FCFF00", "#45FE4F",
# "#00FEFF", "#000099"
# ))(256)
# col = cols[ceiling(rank(HEART_A_vsV$pchisq)/length(HEART_A_vsV$pchisq)*256)]
# Order results according to protein and contrast instead of p-value
res.intensities.sorted <- res.CPTAC.noeff.full %>% arrange(contrast, protein)
res.counts.sorted <- res.CPTAC.noeff.co.full %>% arrange(contrast, protein)
all(res.intensities.sorted$protein == res.counts.sorted$protein)
## [1] TRUE
all(res.intensities.sorted$contrast == res.counts.sorted$contrast)
## [1] TRUE
# library(svglite)
# svglite(file = "all_mock_correlations.svg", width = 14, height = 14)
# # svglite(file = "no_correlation_MSqRob_qbinom_21.svg", width = 7, height = 7)
# par(mar=c(5.1,6.1,4.1,2.1))
# par(mfrow = c(2,2))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treattwo-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treattwo-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 2 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")
# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
#
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_31.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")
# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
#
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_32.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treattwo",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treattwo",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 2", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")
# # par(mar=c(5.1,4.1,4.1,2.1))
# par(mfrow = c(1,1))
# dev.off()
# FDR-FTP plot for methods that do not use Benjamini-Hochberg FDR by default
res.list <- list(CPTAC.Perseus.imp.full,
CPTAC.Perseus.full,
CPTAC.Perseus.imp.full,
CPTAC.Perseus.full,
res.CPTAC.DEP,
res.CPTAC.DEP.noimp,
res.CPTAC.DEP,
res.CPTAC.DEP.noimp
)
res.list <- lapply(res.list, function(x){
left_join(full.Base, x)
}
)
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
## Warning: Column `contrast` has different attributes on LHS and RHS of join
contrast.vec <- c("condition6B-condition6A",
"condition6C-condition6A",
"condition6C-condition6B")
names(contrast.vec) <- c("Condition 6B vs. 6A",
"Condition 6C vs. 6A",
"Condition 6C vs. 6B")
colors <- c("green", "blue", "#42B7BA", "#1B2944", "darkgoldenrod", "#6B3D12", "peru", "yellow3")
sort.list <- list(c("q.BH", "p.value", "Test.statistic", "Difference"), # Benjamini-Hochberg FDR
c("q.BH", "p.value", "Test.statistic", "Difference"), # Benjamini-Hochberg FDR
c("q.value", "p.value", "Test.statistic", "Difference"), # permutation-based FDR
c("q.value", "p.value", "Test.statistic", "Difference"), # permutation-based FDR
c("q.BH", "pvalue", "logFC"), # Benjamini-Hochberg FDR
c("q.BH", "pvalue", "logFC"), # Benjamini-Hochberg FDR
c("qvalue", "pvalue", "logFC"), # local fdr
c("qvalue", "pvalue", "logFC")) # local fdr
PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 5.26315789473684%
## condition6B-condition6A
## 48.6486486486487%
## 64.1509433962264%
## 75.6410256410256%
## condition6B-condition6A
## 16.6666666666667%
## 22.2222222222222%
## 25%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 14.2857142857143%
## 16%
## 29.0322580645161%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 13.0434782608696%
## 21.4285714285714%
## 29.0322580645161%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 4.16666666666667%
## 17.2413793103448%
## condition6C-condition6A
## 4.54545454545455%
## 8.69565217391304%
## 12.5%
## condition6C-condition6A
## 24.2424242424242%
## 40.4761904761905%
## 48.9795918367347%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 4.54545454545455%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 4.54545454545455%
## 8%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 9.09090909090909%
## 13.0434782608696%
## 33.3333333333333%
## condition6C-condition6B
## 13.8888888888889%
## 26.1904761904762%
## 31.1111111111111%
res.list <- list(CPTAC.Perseus.imp.full,
CPTAC.Perseus.full,
res.CPTAC.DEP,
res.CPTAC.mixture,
res.CPTAC.ProDA.full,
res.CPTAC.DEP.noimp,
res.CPTAC.co.full,
res.CPTAC.full,
res.CPTAC.Hurdle)
res.list <- lapply(res.list, function(x){
left_join(common.Base, x)
}
)
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `contrast` has different attributes on LHS and RHS of join
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
contrast.vec <- c("condition6B-condition6A",
"condition6C-condition6A",
"condition6C-condition6B")
names(contrast.vec) <- c("Condition 6B vs. 6A",
"Condition 6C vs. 6A",
"Condition 6C vs. 6B")
colors <- c("#42B7BA", "#1B2944", "peru", "#FFC30B", "#787878", "yellow3", "#E15E9E", "#FF681E", "#5A2A82")
sort.list <- list(c("q.value", "p.value", "Test.statistic", "Difference"),
c("q.value", "p.value", "Test.statistic", "Difference"),
c("qvalue", "pvalue", "logFC"),
c("qvalue", "pvalue"),
c("qval", "pval", "t", "logFC"),
c("qvalue", "pvalue", "logFC"),
c("qvalue", "pvalue", "t", "logOR"),
c("qvalue", "pvalue", "t", "logFC"),
c("qchisq", "pchisq", "chisq", NA))
PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 48.5714285714286%
## 64%
## 75.3424657534247%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 9.52380952380952%
## 9.52380952380952%
## 13.6363636363636%
## condition6B-condition6A
## 17.3913043478261%
## 23.0769230769231%
## 25.9259259259259%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 3.57142857142857%
## 6.89655172413793%
## 12.9032258064516%
## condition6B-condition6A
## 0%
## 3.57142857142857%
## 6.89655172413793%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 13.6363636363636%
## 22.2222222222222%
## 30%
## condition6C-condition6A
## 4.76190476190476%
## 9.09090909090909%
## 13.0434782608696%
## condition6C-condition6A
## 0%
## 0%
## 4.54545454545455%
## condition6C-condition6A
## 3.84615384615385%
## 13.3333333333333%
## 12.9032258064516%
## condition6C-condition6A
## 23.3333333333333%
## 41.025641025641%
## 48.8888888888889%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 10%
## 28.2051282051282%
## 37.7777777777778%
## condition6C-condition6A
## 3.33333333333333%
## 9.375%
## 14.7058823529412%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 4.76190476190476%
## 8.33333333333333%
## condition6C-condition6B
## 9.52380952380952%
## 13.6363636363636%
## 32.1428571428571%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 4%
## 3.7037037037037%
## 7.14285714285714%
## condition6C-condition6B
## 9.375%
## 23.6842105263158%
## 29.2682926829268%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
res.list <- list(res.CPTAC.MSstats.full[!is.infinite(res.CPTAC.MSstats.full$log2FC),],
res.CPTAC.MSstats.noimp.full[!is.infinite(res.CPTAC.MSstats.noimp.full$log2FC),],
res.CPTAC.ProPCA.full.QN.int,
res.CPTAC.RegEM.full,
res.CPTAC.co.full,
res.CPTAC.full,
res.CPTAC.Hurdle)
res.list <- lapply(res.list, function(x){
left_join(common.Base, x)
}
)
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")Joining, by = c("protein", "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
## Joining, by = c("protein", "UPS", "contrast")Joining, by = c("protein",
## "UPS", "contrast")
## Warning: Column `protein` joining character vector and factor, coercing
## into character vector
contrast.vec <- c("condition6B-condition6A",
"condition6C-condition6A",
"condition6C-condition6B")
names(contrast.vec) <- c("Condition 6B vs. 6A",
"Condition 6C vs. 6A",
"Condition 6C vs. 6B")
colors <- c("#808000", "green", "red2", "black", "#E15E9E", "#FF681E", "#5A2A82") # darkblue: "#1B2944" gray: "#50FF00"
sort.list <- list(c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
c("adj.P.Val", "P.Value", "t", "logFC"),
c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logOR"),
c("qvalue", "pvalue", "t", "logFC"),
c("qchisq", "pchisq", "chisq", NA))
PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) # TRUE
## condition6B-condition6A
## 0%
## 5.55555555555556%
## 25%
## condition6B-condition6A
## 0%
## 4%
## 13.7931034482759%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 0%
## 0%
## 0%
## condition6B-condition6A
## 3.57142857142857%
## 6.89655172413793%
## 12.9032258064516%
## condition6B-condition6A
## 0%
## 3.57142857142857%
## 6.89655172413793%
## condition6C-condition6A
## 0%
## 24.3243243243243%
## 27.5%
## condition6C-condition6A
## 6.45161290322581%
## 16.6666666666667%
## 23.0769230769231%
## condition6C-condition6A
## 0%
## 0%
## 4.54545454545455%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 0%
## 0%
## 0%
## condition6C-condition6A
## 10%
## 28.2051282051282%
## 37.7777777777778%
## condition6C-condition6A
## 3.33333333333333%
## 9.375%
## 14.7058823529412%
## condition6C-condition6B
## 0%
## 0%
## 4.16666666666667%
## condition6C-condition6B
## 0%
## 3.57142857142857%
## 6.45161290322581%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
## condition6C-condition6B
## 0%
## 0%
## 0%
Aditionally, we here provide the plots for the underlying (preprocessed) peptide-level data for the 1500 most significant proteins for each method in the HEART dataset, grouped per method.
### First 1500 most significant genes ###
# A. Select top 1500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1500]
# B. Select top 1500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1500 <- hurdle.AvsV.ov[1:1500,]$gene.name
# C. Select top 1500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1500 <- MSqRob.AvsV.ov[1:1500,]$gene.name
### Overlap all (Supplementary Fig. 8) ###
plot_proteins(gene.names = "MYL7", title = "MYL7", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "PAM", title = "PAM", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "MYBPHL", title = "MYBPHL", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "MAP1B", title = "MAP1B", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("MYL7", "PAM", "MYBPHL", "MAP1B"), title = c("overlap_all.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from all three methods
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & (genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_all.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Overlap Hurdle and Perseus (Supplementary Fig. 9) ###
plot_proteins(gene.names = "CA13", title = "CA13", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "NTN1", title = "NTN1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "PRKG2", title = "PRKG2", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "SBF2", title = "SBF2", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("CA13", "NTN1", "PRKG2", "SBF2"), title = c("overlap_hurdle_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and Perseus and against MSqRob
genes <- genes.Hurdle.AvsV.1500[(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_hurdle_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Hurdle alone (Supplementary Fig. 10) ###
plot_proteins(gene.names = "HTT", title = "HTT", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "ACACA", title = "ACACA", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "ABCA6", title = "ABCA6", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "FTCD", title = "FTCD", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("HTT", "ACACA", "ABCA6", "FTCD"), title = c("only_hurdle.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and against MSqRob and Perseus
genes <- genes.Hurdle.AvsV.1500[!(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Perseus alone (Supplementary Fig. 11) ###
plot_proteins(gene.names = "GJA5", title = "GJA5", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "ASNSD1", title = "ASNSD1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "MTCL1", title = "MTCL1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "BLOC1S5;BLOC1S5-TXNDC5", title = "BLOC1S5;BLOC1S5-TXNDC5", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("GJA5", "ASNSD1", "MTCL1", "BLOC1S5;BLOC1S5-TXNDC5"), title = c("only_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Perseus and against Hurdle and MSqRob
genes <- genes.Perseus.AvsV.1500[!(genes.Perseus.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Perseus.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Overlap MSqRob and Perseus (Supplementary Fig. 12) ###
plot_proteins(gene.names = "AK4", title = "AK4", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "NDUFV1", title = "NDUFV1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "DECR1", title = "DECR1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "SDHA", title = "SDHA", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("AK4", "NDUFV1", "DECR1", "SDHA"), title = c("overlap_MSqRob_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from MSqRob and Perseus and against Hurdle
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)
### MSqRob alone ###
# Order p-values by evidence from MSqRob and against Hurdle and Perseus
genes <- genes.MSqRob.AvsV.1500[!(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_MSqRob.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Overlap MSqRob and Hurdle ###
# Order p-values by evidence from MSqRob and Hurdle and against Perseus
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Venn diagrams atrial vs. ventricular region ###
### Checks: ###
all(Perseus.AvsV.ov$`Gene names` %in% hurdle.AvsV.ov$gene.name)
## [1] TRUE
all(Perseus.AvsV.ov$`Gene names` %in% MSqRob.AvsV.ov$gene.name)
## [1] TRUE
all(hurdle.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
all(MSqRob.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
dim(MSqRob.AvsV.ov)
## [1] 7822 9
dim(hurdle.AvsV.ov)
## [1] 7822 8
length(unique(Perseus.AvsV.ov$`Gene names`))
## [1] 7822
################
### Top: the first 500 significant genes ###
# A. Select top 500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:500]
# B. Select top 500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.500 <- hurdle.AvsV.ov[1:500,]$gene.name
# C. Select top 500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.500 <- MSqRob.AvsV.ov[1:500,]$gene.name
# MSqRob alone
sum(!(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 99
# 99
# Hurdle alone
sum(!(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 85
# 85
# Perseus alone
sum(!(genes.Perseus.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Perseus.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218
# Overlap MSqRob and Hurdle
sum((genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 158
# 158
# Overlap MSqRob and Perseus
sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 25
# 25
# Overlap Hurdle and Perseus
sum((genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500))
## [1] 39
# 39
# Overlap everything
sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & (genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218
# Control:
length(genes.MSqRob.AvsV.500)
## [1] 500
99+218+158+25
## [1] 500
length(genes.Hurdle.AvsV.500)
## [1] 500
85+158+218+39
## [1] 500
length(genes.Perseus.AvsV.500)
## [1] 500
218+39+25+218
## [1] 500
### Bottom: the first 1000 significant genes ###
# A. Select top 1000 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1000 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1000]
# B. Select top 1000 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1000 <- hurdle.AvsV.ov[1:1000,]$gene.name
# C. Select top 1000 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1000 <- MSqRob.AvsV.ov[1:1000,]$gene.name
# MSqRob alone
sum(!(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 174
# 174
# Hurdle alone
sum(!(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 134
# 134
# Perseus alone
sum(!(genes.Perseus.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Perseus.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 439
# 439
# Overlap MSqRob and Hurdle
sum((genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 363
# 363
# Overlap MSqRob and Perseus
sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 58
# 58
# Overlap Hurdle and Perseus
sum((genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000))
## [1] 98
# 98
# Overlap everything
sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & (genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 405
# 405
# Control:
length(genes.MSqRob.AvsV.1000)
## [1] 1000
174+363+58+405
## [1] 1000
length(genes.Hurdle.AvsV.1000)
## [1] 1000
134+363+98+405
## [1] 1000
length(genes.Perseus.AvsV.1000)
## [1] 1000
439+58+98+405
## [1] 1000